#############
#############
#############
## Stefan Müller and Sven-Oliver Proksch:
## Nostalgia in European Party Politics:
## A Text-Based Measurement Approach
## British Journal of Political Science
##
## Script loads the translated corpus,
## applies the dictionaries, and classifies sentences based on SVM.
## It also loads and merges the DistilBERT classification,
## which was conducted in Python.
## Script also returns term frequency analysis, keyness analysis,
## and a comparison of nostalgia across economic, cultural, and other areas
## Outputs: Table A5, Table A6, Table A7, Table A15
## Figure A12, Figure A13
#############
#############
#############

# Load packages
library(quanteda)            # CRAN v3.3.1
library(quanteda.textstats)  # CRAN v0.96.3
library(quanteda.textmodels) # CRAN v0.9.6
library(dplyr)               # CRAN v1.1.2
library(ggplot2)             # CRAN v3.4.2
library(forcats)             # CRAN v1.0.0
library(stringr)             # CRAN v1.5.0
library(rio)                 # CRAN v0.5.29
library(lubridate)           # CRAN v1.9.2
library(xtable)              # CRAN v1.8-4
library(haven)               # CRAN v2.5.3
library(tidyr)               # CRAN v1.3.0


# If the code does not run, one or more packages may have been 
# updated, which may result in errors or conflicts. You can solve this issue
# by installing the package version listed above or by using the 
# groundhog package:
# after installing groundhog using install.packages("groundhog")
# change library(name_of_package) to
# groundhog::groundhog.library(name_of_package, date = "2023-09-04")
# Instead of adjusting the library() function for each package,
# you can adjust them all at once using
# the following syntax:
# groundhog.library("
#                   library('pkgA')
#                   library('pkgB')
#                   library('pkgC')", date = "2023-09-04")
# More details are available at: https://groundhogr.com/using/

# set working directory, either by using the here() package,
# setwd() or by creating an RProj file

# load the custom ggplot2 theme
source("function_theme_base.R")

# load the translated corpus (built from several spreadsheets that
# were translated at the (quasi-)sentence level using Google Translate)
dat_raw <- readRDS("data_corpus_translated_full.rds")

# drop rows whose text is an empty string or missing
dat_raw <- filter(dat_raw, text != "", !is.na(text))

# select variables and save for PolNos dataset V1
# dat_raw_select <- dat_raw |>
#     select(
#         manifesto_id, type, text,
#         doc_id, countryname, party,
#         edate,
#         pos, cmp_code
#     )
# saveRDS(dat_raw_select, "data_translated_full.rds")


# build a sentence-level ID: manifesto ID, underscore, and the running
# number of the sentence within its manifesto
dat_raw <- dat_raw |>
    group_by(manifesto_id) |>
    mutate(doc_id = paste(manifesto_id, row_number(), sep = "_")) |>
    ungroup()


# build a quanteda corpus from the raw sentences
corp_full <- corpus(dat_raw, text_field = "text")

# number of tokens per sentence, not counting punctuation
corp_full$ntoken_en <- ntoken(corp_full, remove_punct = TRUE)

# copy the token counts over to the data frame
dat_raw$ntoken_en <- corp_full$ntoken_en

# keep sentences with more than 5 and fewer than 60 tokens; this aims
# to remove very short (incomplete) and very long sentences
dat_full <- filter(dat_raw, ntoken_en > 5, ntoken_en < 60)

# number of sentences before and after the length filter
nrow(dat_raw)
nrow(dat_full)

# > nrow(dat_raw)
# [1] 1439551
# > nrow(dat_full)
# [1] 1192679


# share of the raw corpus that is retained
nrow(dat_full) / nrow(dat_raw)

# smallest and largest sentence length (should be 6 and 59)
min(dat_full$ntoken_en)
max(dat_full$ntoken_en)

# > min(dat_full$ntoken_en)
# [1] 6
# > max(dat_full$ntoken_en)
# [1] 59

# compare the number of manifesto IDs before and after dropping
# short and long sentences
set.seed(235)
n_distinct(dat_raw$manifesto_id)

n_distinct(dat_full$manifesto_id)
# number of manifestos remained the same

# distribution of CMP-annotated and not-annotated sentences
table(dat_full$type)


# number of countries
n_distinct(dat_full$countryname)
# [1] 24


# recreate the sentence IDs on the filtered data; note that paste()
# uses its default separator (a single space) here
dat_full <- dat_full |>
    group_by(manifesto_id) |>
    mutate(doc_id = paste(manifesto_id, row_number())) |>
    ungroup()

# columns needed for the BERT prediction in Python
dat_full_bert <- select(dat_full, text, manifesto_id, doc_id)

# divide up data into six data frames for efficient BERT classification
# dat_full_bert1 <- dat_full_bert[1:200000,]
# dat_full_bert2 <- dat_full_bert[200001:400000,]
# dat_full_bert3 <- dat_full_bert[400001:600000,]
# dat_full_bert4 <- dat_full_bert[600001:800000,]
# dat_full_bert5 <- dat_full_bert[800001:1000000,]
# dat_full_bert6 <- dat_full_bert[1000000:nrow(dat_full_bert),]

# write.csv(dat_full_bert1, "data_bert_unlabeled/dat_full_bert1.csv",
#           fileEncoding = "utf-8", row.names = FALSE)
# head(dat_full_bert1$text, 2)
#
# write.csv(dat_full_bert2, "data_bert_unlabeled/dat_full_bert2.csv",
#           fileEncoding = "utf-8", row.names = FALSE)
# head(dat_full_bert2$text, 2)
#
# write.csv(dat_full_bert3, "data_bert_unlabeled/dat_full_bert3.csv",
#           fileEncoding = "utf-8", row.names = FALSE)
# head(dat_full_bert3$text, 2)
#
# write.csv(dat_full_bert4, "data_bert_unlabeled/dat_full_bert4.csv",
#           fileEncoding = "utf-8", row.names = FALSE)
# head(dat_full_bert4$text, 2)
#
# write.csv(dat_full_bert5, "data_bert_unlabeled/dat_full_bert5.csv",
#           fileEncoding = "utf-8", row.names = FALSE)
# head(dat_full_bert5$text, 2)
#
# write.csv(dat_full_bert6, "data_bert_unlabeled/dat_full_bert6.csv",
#           fileEncoding = "utf-8", row.names = FALSE)
# head(dat_full_bert6$text, 2)

# quanteda corpus of the filtered sentences
corp <- corpus(dat_full, text_field = "text")

# use 10 threads to speed up processing
# (comment this line out if not needed or not possible)
quanteda_options(threads = 10)

# load the nostalgia dictionary and inspect it
dict_full <- readRDS("dictionary_full.rds")

summary(dict_full)
dict_full

# tokenise the corpus, lowercase all tokens, and drop English stopwords
toks_manifestos_en <- tokens_remove(
    tokens_tolower(tokens(corp)),
    pattern = stopwords("en")
)

ndoc(toks_manifestos_en)

# count matches of the nostalgia dictionary keys per sentence and
# return them as a data frame
dfmat_lookup_nostalgic <- quanteda::convert(
    dfm(tokens_lookup(toks_manifestos_en, dictionary = dict_full)),
    to = "data.frame"
)

# inspect data frame
head(dfmat_lookup_nostalgic)

# names of dictionary keys in data frame
names(dfmat_lookup_nostalgic)

# sentence-level sentiment counts for the additional nostalgia condition,
# based on the Lexicoder Sentiment Dictionary
dfmat_lookup_sentiment <- quanteda::convert(
    dfm(tokens_lookup(
        toks_manifestos_en,
        dictionary = data_dictionary_LSD2015,
        nested_scope = "dictionary"
    )),
    to = "data.frame"
)
dfmat_lookup_sentiment <- dplyr::select(
    dfmat_lookup_sentiment,
    positive, negative, neg_positive, neg_negative
)


# put the nostalgia counts and the sentiment counts side by side
dfmat_lookup <- bind_cols(dfmat_lookup_nostalgic, dfmat_lookup_sentiment)


# classify data using SVM

# load the hand-coded training data
dat_train <- read.csv("data_coded_all.csv",
                      fileEncoding = "utf-8",
                      stringsAsFactors = FALSE
)

# create corpus with training data
# (argument spelled out as `text_field`; the previous `text = "text"` only
# worked through partial argument matching)
corp_train <- corpus(dat_train, text_field = "text")

# tokenise the training data and compound multi-word dictionary terms
toks_train <- tokens(corp_train) |>
    tokens_compound(pattern = phrase(dict_full))

# document-feature matrix of the training data
dfmat_train <- dfm(toks_train)

# compound multi-word dictionary terms in the manifesto tokens and
# create the document-feature matrix to be classified
dfmat_manifestos_en <- toks_manifestos_en |>
    tokens_compound(pattern = phrase(dict_full)) |>
    dfm()

# train SVM classifier on the `nostalgic` document variable
tmod_svm <- textmodel_svm(dfmat_train,
                          y = dfmat_train$nostalgic
)

# restrict the manifesto dfm to the features of the training set
dfmat_matched <- dfm_match(dfmat_manifestos_en, features = featnames(dfmat_train))

# predict all sentences
pred <- predict(tmod_svm, dfmat_matched)

table(pred)

# predictions are assigned by position, so there must be exactly one
# prediction per row of the lookup data frame
stopifnot(length(pred) == nrow(dfmat_lookup))

# add predicted class as "nostalgia_svm"
dfmat_lookup$nostalgia_svm <- pred

# per-sentence totals of dictionary matches (with and without the
# embeddings key), plus the log-ratio sentiment score following the
# Proksch et al. (2019) aggregation and a flag for "net positive" sentences
dfmat_lookup <- dfmat_lookup |>
    mutate(
        nostalgia_sum = nostalgia.nostalgia_davalos +
            nostalgia.nostalgia_extensions +
            nostalgia.nostalgia_handcoding,
        nostalgia_sum_emb = nostalgia_sum + nostalgia_emb,
        sentiment = log((positive + neg_negative + 0.5) / (negative + neg_positive + 0.5)),
        net_positive = sentiment > 0
    )


# keep only tokens that match the nostalgia dictionary and join
# multi-word matches with a space
toks_sel_manifestos_en <- tokens_keep(toks_manifestos_en,
                                      pattern = phrase(dict_full)
) |>
    tokens_compound(
        pattern = phrase(dict_full),
        concatenator = " "
    )


# the selection labelled "FOR EMBEDDINGS" uses exactly the same input,
# pattern, and options as the one above, so the result is reused instead
# of running the selection over the whole corpus a second time
toks_sel_manifestos_full <- toks_sel_manifestos_en


# get most frequent terms across the entire corpus
tstat_freq_manifestos_full <- toks_sel_manifestos_full |>
    dfm() |>
    textstat_frequency(n = 10000)


# repeat the selection using only the embeddings key of the dictionary
toks_sel_manifestos_emb <- tokens_keep(toks_manifestos_en,
                                       pattern = phrase(dict_full$nostalgia_emb)
) |>
    tokens_compound(
        pattern = phrase(dict_full$nostalgia_emb),
        concatenator = " "
    )



# attach the sentence-level estimates to the sentence data (column-wise)
dat_classified <- bind_cols(dfmat_lookup, dat_full)


# load classifications from BERT models

# small reader so that all six files are read with identical settings
read_bert_csv <- function(path) {
    read.csv(path, stringsAsFactors = FALSE, fileEncoding = "utf-8")
}

dat_bert_classified_1 <- read_bert_csv("data_classified_bert_1.csv")
dat_bert_classified_2 <- read_bert_csv("data_classified_bert_2.csv")
dat_bert_classified_3 <- read_bert_csv("data_classified_bert_3.csv")
dat_bert_classified_4 <- read_bert_csv("data_classified_bert_4.csv")
dat_bert_classified_5 <- read_bert_csv("data_classified_bert_5.csv")
dat_bert_classified_6 <- read_bert_csv("data_classified_bert_6.csv")


# stack all BERT predictions into one long data frame
dat_bert_joined <- bind_rows(list(
    dat_bert_classified_1,
    dat_bert_classified_2,
    dat_bert_classified_3,
    dat_bert_classified_4,
    dat_bert_classified_5,
    dat_bert_classified_6
))


# number of doc_ids that occur more than once in the BERT output.
# The (commented-out) split into six files above uses 800001:1000000 for
# chunk 5 and 1000000:nrow(...) for chunk 6, i.e. row 1000000 is part of
# both chunks. If the CSV files were produced with that split, its doc_id
# appears twice here and would duplicate the sentence in the join below.
sum(duplicated(dat_bert_joined$doc_id))

# keep only relevant variables
# (keep text to ensure that correct predictions are merged by doc_id)
# and keep a single row per doc_id so that the join cannot add rows
dat_bert_joined_select <- dat_bert_joined |>
    select(nostalgic_bert, doc_id, text_bert = text) |>
    distinct(doc_id, .keep_all = TRUE)



# rename doc_id for merging: bind_cols() had to rename the doc_id columns
# contributed by both inputs; `doc_id...1` is the first of them
dat_classified <- dat_classified |>
    mutate(doc_id = doc_id...1)

length(unique(dat_classified$doc_id))

# check for differences in doc_ids
setdiff(dat_classified$doc_id, dat_bert_joined_select$doc_id)

# these sentences were invalid for the BERT classification

# merge datasets
dat_classified_with_bert <- left_join(dat_classified,
                                      dat_bert_joined_select,
                                      by = "doc_id"
)

# the BERT predictions are unique per doc_id, so the join must
# return exactly one row per classified sentence
stopifnot(nrow(dat_classified_with_bert) == nrow(dat_classified))

# get the sentences not included in BERT classifiers due to invalid characters
setdiff(dat_classified_with_bert$text, dat_classified_with_bert$text_bert)

# number of sentences without a BERT prediction
dat_classified_with_bert |>
    filter(is.na(nostalgic_bert)) |>
    nrow()

# inspect missing BERT predictions
# (6 sentences were not classified by the transformer since too short/empty)
table(dat_classified_with_bert$nostalgic_bert, useNA = "always")

# set the rows coded as NA to 0;
# otherwise, the entire manifesto would be classified as NA
dat_classified_with_bert$nostalgic_bert <- ifelse(
    is.na(dat_classified_with_bert$nostalgic_bert),
    0,
    dat_classified_with_bert$nostalgic_bert
)

# confirm that no NA values are left
table(dat_classified_with_bert$nostalgic_bert, useNA = "always")

# get information on nostalgic sentences for each measure

# restrict to sentences that carry a CMP code
dat_classified_cmp <- dat_classified_with_bert |>
    filter(!is.na(cmp_code))

# number of manifestos in the full and in the CMP-coded data
n_distinct(dat_classified_with_bert$manifesto_id)

n_distinct(dat_classified_cmp$manifesto_id)


# number of quasi-sentence annotated manifesto sentences, i.e. rows whose
# three-character CMP code is not missing
sum(!is.na(substr(dat_classified_cmp$cmp_code, 1, 3)))

# labels for the party family codes stored in `parfam`
parfam_labels <- c(
    "10" = "Ecological",
    "20" = "Socialist",
    "30" = "Social Democratic",
    "40" = "Liberal",
    "50" = "Christian Democratic",
    "60" = "Conservative",
    "70" = "Nationalist",
    "80" = "Other", # agrarian
    "90" = "Other", # ethnic and regional
    "95" = "Other", # special issue
    "98" = "Other"  # alliances
)

# recode party families, derive the sentence-level nostalgia dummies, and
# reshape to long format with one row per sentence and measure
dat_classified_cmp_long <- dat_classified_cmp |>
    mutate(cmp_clean = substr(cmp_code, 1, 3)) |>
    filter(!is.na(cmp_clean)) |>
    mutate(
        party_family_recoded = dplyr::recode(parfam, !!!parfam_labels),
        nostalgia_sentence_bert = nostalgic_bert,
        nostalgia_sentence_dummy = ifelse(nostalgia_sum > 0, 1, 0),
        nostalgia_sentence_dummy_emb = ifelse(nostalgia_sum_emb > 0, 1, 0),
        nostalgia_sentence_dummy_sentiment =
            ifelse(nostalgia_sum > 0 & net_positive == TRUE, 1, 0),
        nostalgia_sentence_dummy_sentiment_emb =
            ifelse(nostalgia_sum_emb > 0 & net_positive == TRUE, 1, 0),
        nostalgia_sentence_svm_or_bert =
            ifelse(nostalgia_svm + nostalgia_sentence_bert >= 1, 1, 0)
    ) |>
    select(
        starts_with("nostalgia_sentence"),
        party, countryname, date, party_family_recoded,
        nostalgia_svm, doc_id, cmp_clean
    ) |>
    gather(
        measure, nostalgic_dummy,
        -c(doc_id, party, countryname, date, cmp_clean, party_family_recoded)
    ) |>
    # the code "000" is stored as "0"
    mutate(cmp_clean = ifelse(cmp_clean == "000", "0", cmp_clean))


summary(dat_classified_cmp_long)
table(dat_classified_cmp_long$measure)

# create dummy variables for nostalgia on sentence level and
# aggregate them per party and date

dat_classified_with_bert <- dat_classified_with_bert |>
    rename(
        nostalgia_sentence_bert = nostalgic_bert,
        nostalgia_sentence_svm = nostalgia_svm
    ) |>
    # sentence-level indicators (each row is evaluated on its own)
    mutate(
        nostalgia_sentence_dummy = as.numeric(nostalgia_sum > 0),
        nostalgia_sentence_dummy_emb = as.numeric(nostalgia_sum_emb > 0),
        nostalgia_sentence_dummy_sentiment =
            as.numeric(nostalgia_sum > 0 & net_positive),
        nostalgia_sentence_dummy_sentiment_emb =
            as.numeric(nostalgia_sum_emb > 0 & net_positive),
        nostalgia_sentence_svm_or_bert =
            as.numeric(nostalgia_sentence_svm + nostalgia_sentence_bert >= 1),
        nostalgia_sentence_ml_and_emb =
            as.numeric(nostalgia_sentence_svm_or_bert + nostalgia_sentence_dummy_emb == 2),
        # number of the six individual measures that flag the sentence
        nostalgia_ensemble_count = nostalgia_sentence_bert + nostalgia_sentence_svm +
            nostalgia_sentence_dummy + nostalgia_sentence_dummy_emb +
            nostalgia_sentence_dummy_sentiment +
            nostalgia_sentence_dummy_sentiment_emb
    ) |>
    # aggregates within each party-date combination
    group_by(party, date) |>
    mutate(
        n_sentences_manifesto = n(),
        nostalgia_sentences_per_1000_svm = 1000 * mean(nostalgia_sentence_svm),
        nostalgia_sentences_per_1000 = 1000 * mean(nostalgia_sentence_dummy),
        nostalgia_sentences_per_1000_bert = 1000 * mean(nostalgia_sentence_bert),
        nostalgia_sentences_per_1000_emb = 1000 * mean(nostalgia_sentence_dummy_emb),
        nostalgia_sentences_per_1000_sentiment =
            1000 * mean(nostalgia_sentence_dummy_sentiment),
        nostalgia_sentences_per_1000_sentiment_emb =
            1000 * mean(nostalgia_sentence_dummy_sentiment_emb),
        nostalgia_sentences_per_1000_svm_or_bert =
            1000 * mean(nostalgia_sentence_svm_or_bert),
        nostalgia_sentences_per_1000_ml_and_emb =
            1000 * mean(nostalgia_sentence_ml_and_emb),
        nost_sents_sum_base = sum(nostalgia_sentence_dummy),
        nost_sents_sum_emb = sum(nostalgia_sentence_dummy_emb),
        nost_sents_sum_svm = sum(nostalgia_sentence_svm),
        nost_sents_sum_bert = sum(nostalgia_sentence_bert),
        nost_sents_sum_ml_and_emb = sum(nostalgia_sentence_ml_and_emb),
        nost_sents_sum_svm_or_bert = sum(nostalgia_sentence_svm_or_bert),
        nost_sents_sum_emb_pos = sum(nostalgia_sentence_dummy_sentiment_emb),
        nost_sents_sum_pos = sum(nostalgia_sentence_dummy_sentiment),
        nostalgia_sum_manifesto = sum(nostalgia_sum, na.rm = TRUE),
        nostalgia_sum_manifesto_emb = sum(nostalgia_sum_emb, na.rm = TRUE),
        nostalgia_sum_manifesto_bert = sum(nostalgia_sentence_bert, na.rm = TRUE),
        nostalgia_sum_manifesto_svm = sum(nostalgia_sentence_svm, na.rm = TRUE)
    ) |>
    ungroup()

# check for missing observations
sum(is.na(dat_classified_with_bert$nostalgia_sentence_bert))
# no missing observations

# sentence-level data for the PolNos dataset V1: replace doc_id by a
# running ID over all rows and keep only the relevant variables
dat_classified_with_bert_polnos <- dat_classified_with_bert
dat_classified_with_bert_polnos$doc_id <- paste0(
    "doc_", 1:nrow(dat_classified_with_bert_polnos)
)
dat_classified_with_bert_polnos <- select(
    dat_classified_with_bert_polnos,
    manifesto_id, type, text,
    doc_id, countryname, party,
    edate,
    pos, cmp_code,
    ntoken = ntoken_en,
    sentiment,
    nostalgia_sentence_dummy,
    nostalgia_sentence_dummy_emb,
    nostalgia_sentence_dummy_sentiment,
    nostalgia_sentence_dummy_sentiment_emb,
    nostalgia_sentence_bert,
    nostalgia_sentence_svm,
    nostalgia_sentence_svm_or_bert,
    nostalgia_sentence_ml_and_emb,
    nostalgia_ensemble_count
)
# saveRDS(dat_classified_with_bert_polnos, "data_polnos_sentencelevel.rds")


# get the most frequent dictionary matches:
# long data frame with one row per sentence and dictionary measure.
# (The doc_id previously created here was dropped again by select()
# and is therefore no longer computed.)
dat_topfreq_long <- dat_classified_with_bert |>
    select(
        text, nostalgia_sentence_dummy,
        nostalgia_sentence_dummy_emb,
        nostalgia_sentence_dummy_sentiment,
        nostalgia_sentence_dummy_sentiment_emb,
        countryname
    ) |>
    gather(measure, nostalgia_dummy, -c(text, countryname))


# dictionary-based measures used in the frequency analysis
measures_topfreq <- c(
    "nostalgia_sentence_dummy",
    "nostalgia_sentence_dummy_emb",
    "nostalgia_sentence_dummy_sentiment",
    "nostalgia_sentence_dummy_sentiment_emb"
)


# go through all dictionary measures, keep only text that contains
# nostalgic rhetoric (according to dictionary),
# then keep terms from the dictionary,
# compound terms, calculate frequencies

# across measures
dat_topfeats_all <- data.frame()

# for each country
dat_topfeats_country <- data.frame()

for (i in measures_topfreq) {
    cat("Frequency analysis for", i, "\n")

    # sentences flagged as nostalgic by measure i
    dat_filtered_measure <- dat_topfreq_long |>
        filter(measure == i) |>
        filter(nostalgia_dummy == 1)

    toks_topfreq <- dat_filtered_measure |>
        corpus(text_field = "text") |>
        tokens() |>
        tokens_keep(pattern = dict_full) |> # keep only dictionary matches
        tokens_compound(pattern = phrase(dict_full)) # compound words

    # build the document-feature matrix once and use it for both the
    # overall and the country-level frequencies
    dfmat_topfreq <- dfm(toks_topfreq)

    # get all matches and their frequencies from dictionary
    tstat_topfreq <- dfmat_topfreq |>
        textstat_frequency() |>
        as.data.frame()

    # assign measure to data frame
    tstat_topfreq$measure <- i

    # bind data frame
    dat_topfeats_all <- bind_rows(tstat_topfreq, dat_topfeats_all)

    # frequencies separately for each country
    tstat_topfreq_country <- dfmat_topfreq |>
        textstat_frequency(groups = countryname) |>
        as.data.frame()

    # assign measure to data frame
    tstat_topfreq_country$measure <- i

    # bind data frame
    dat_topfeats_country <- bind_rows(dat_topfeats_country, tstat_topfreq_country)
}

## country-level table: terms with rank <= 5 per country and measure,
## collapsed into one string per country; only the base dictionary
## measure is kept
dat_topfeats_all_country_tab <- dat_topfeats_country |>
    filter(rank <= 5) |>
    mutate(
        frequency = str_squish(format(frequency, big.mark = ",")),
        feature_freq = paste0(feature, " (", frequency, ")")
    ) |>
    group_by(group, measure) |>
    summarise(`Terms and Phrases` = paste(feature_freq, collapse = ", ")) |>
    mutate(
        measure = dplyr::recode(
            str_remove_all(measure, "nostalgia_sentence_"),
            "dummy" = "Dictionary",
            "dummy_sentiment" = "Dictionary + Sentiment",
            "dummy_sentiment_emb" = "Dictionary + Emb. + Sentiment",
            "dummy_emb" = "Dictionary + Emb."
        )
    ) |>
    filter(measure == "Dictionary") |>
    select(-measure) |>
    rename(Country = group)


# Table A5 ----
# build the LaTeX table object first, then write it to tab_a05.tex
xtab_a05 <- xtable(
    dat_topfeats_all_country_tab,
    digits = 1,
    caption = "Frequencies of dictionary entries (base dictionary) for sentences classified as nostalgic, separately for each country. Frequencies in parentheses. Table lists the five most frequent terms/phrases.",
    label = "tab:freq_country",
    align = c("p{0.03\\textwidth}", "p{0.18\\textwidth}", "p{0.8\\textwidth}")
)

print(
    xtab_a05,
    type = "latex",
    digits = 1,
    size = "footnotesize",
    file = "tab_a05.tex",
    include.rownames = FALSE,
    caption.placement = "top"
)



# keyness analysis for each of the six measures

# data with a running sentence ID
dat_keyness_all <- dat_classified_with_bert
dat_keyness_all$doc_id <- paste0("doc_", 1:nrow(dat_keyness_all))

# tokens for the keyness analysis: no punctuation, no symbols,
# no English stopwords
toks_keyness <- tokens_remove(
    tokens(
        corpus(dat_keyness_all, text_field = "text"),
        remove_punct = TRUE,
        remove_symbols = TRUE
    ),
    pattern = stopwords("en")
)

# the six measures used in the analysis
measures <- c(
    "nostalgia_sentence_svm",
    "nostalgia_sentence_bert",
    "nostalgia_sentence_dummy",
    "nostalgia_sentence_dummy_emb",
    "nostalgia_sentence_dummy_sentiment",
    "nostalgia_sentence_dummy_sentiment_emb"
)


# run keyness analysis in a loop and store terms
# in dat_keyness_words

# the document-feature matrix does not depend on the measure, so it is
# built once here instead of once per loop iteration
dfmat_keyness <- dfm(toks_keyness)

# empty data frame
dat_keyness_words <- data.frame()

for (i in measures) {
    cat("Keyness analysis for", i, "\n")

    # group the dfm by the values of measure i
    dfmat_grouped <- dfm_group(dfmat_keyness,
                               groups = docvars(dfmat_keyness, i)
    )

    # run keyness analysis with the group "1" as target
    keyness <- textstat_keyness(dfmat_grouped, target = "1")

    # create variable indicating the measure
    keyness$measure <- i

    # bind data frame
    dat_keyness_words <- bind_rows(keyness, dat_keyness_words)
}


# first 50 rows per measure, in the order returned by the keyness analysis
dat_keyness_words_max_50 <- dat_keyness_words |>
    group_by(measure) |>
    mutate(rank = row_number()) |>
    filter(rank <= 50)


# order in which the measures appear in the table
levels_keyness_measures <- c(
    "Dictionary",
    "Dictionary + Embeddings",
    "Dictionary + Sentiment",
    "Dictionary + Embeddings + Sentiment",
    "SVM", "DistilBERT"
)

# collapse the terms of each measure into one row, then label and
# order the measures for the table
dat_keyness_words_max_50_tab <- dat_keyness_words_max_50 |>
    group_by(measure) |>
    summarise(Words = paste(feature, collapse = ", ")) |>
    mutate(
        measure = dplyr::recode(
            str_remove_all(measure, "nostalgia_sentence_"),
            "bert" = "DistilBERT",
            "dummy" = "Dictionary",
            "dummy_sentiment" = "Dictionary + Sentiment",
            "dummy_sentiment_emb" = "Dictionary + Embeddings + Sentiment",
            "dummy_emb" = "Dictionary + Embeddings",
            "sentiment_emb" = "Dictionary + Sentiment + Embeddings",
            "svm" = "SVM"
        )
    ) |>
    rename(Measure = measure) |>
    mutate(Measure = factor(Measure, levels = levels_keyness_measures)) |>
    arrange(Measure)


# Table A6 ----
# build the LaTeX table object first, then write it to tab_a06.tex
xtab_a06 <- xtable(
    dat_keyness_words_max_50_tab,
    digits = 1,
    caption = "Keyness analysis for sentences classified as nostalgic, separately for each method. Table lists the 50 most important features distinguishing `nostalgic' sentences (target category) from sentences classified as `not nostalgic' (reference category).",
    label = "tab:keyness",
    align = c("p{0.03\\textwidth}", "p{0.3\\textwidth}", "p{0.7\\textwidth}")
)

print(
    xtab_a06,
    type = "latex",
    digits = 1,
    size = "footnotesize",
    file = "tab_a06.tex",
    include.rownames = FALSE,
    caption.placement = "top"
)

# keep dictionary matches that occur at least five times, collapse them
# into one string per measure, and tidy up the names of the measures
dat_topfeats_all_tab <- dat_topfeats_all |>
    filter(frequency >= 5) |>
    mutate(
        frequency = str_squish(format(frequency, big.mark = ",")),
        feature_freq = paste0(feature, " (", frequency, ")")
    ) |>
    group_by(measure) |>
    summarise(`Terms and Phrases` = paste(feature_freq, collapse = ", ")) |>
    mutate(
        measure = dplyr::recode(
            str_remove_all(measure, "nostalgia_sentence_"),
            "dummy" = "Dictionary",
            "dummy_sentiment" = "Dictionary + Sent.",
            "dummy_sentiment_emb" = "Dictionary + Emb. + Sent.",
            "dummy_emb" = "Dictionary + Emb."
        )
    ) |>
    rename(Measure = measure)


# Table A7 ----
# build the LaTeX table object first, then write it to tab_a07.tex
xtab_a07 <- xtable(
    dat_topfeats_all_tab,
    digits = 1,
    caption = "Frequencies of dictionary entries for sentences classified as nostalgic, separately for each dictionary. Frequencies in parentheses. Table lists terms and phrases that appear at least five times in sentences classified as nostalgic.",
    label = "tab:freq",
    align = c("p{0.03\\textwidth}", "p{0.1\\textwidth}", "p{0.8\\textwidth}")
)

print(
    xtab_a07,
    type = "latex",
    digits = 1,
    size = "footnotesize",
    file = "tab_a07.tex",
    include.rownames = FALSE,
    caption.placement = "top"
)


# # get all CMP codes for manual recoding into broader categories
# dat_classified_cmp_nostalgia_count <- dat_classified_cmp_nostalgia |>
#     select(cmp_clean) |>
#     unique() |>
#     arrange(cmp_clean)
# write_csv(dat_classified_cmp_nostalgia_count, "data_cmp_codes_notclassified.csv")

# load spreadsheet with CMP codes (economic/cultural/neither)
dat_codes <- read.csv("data_cmp_codes.csv", sep = ";")

# strip the "nostalgia_sentence_" prefix from the measure names and
# replace them by the labels used in tables and figures
recode_measure_label <- function(measure) {
    dplyr::recode(str_remove_all(measure, "nostalgia_sentence_"),
                  "bert" = "DistilBERT",
                  "dummy" = "Dictionary",
                  "dummy_sentiment" = "Dictionary + Sentiment",
                  "dummy_sentiment_emb" = "Dictionary + Embeddings + Sentiment",
                  "dummy_emb" = "Dictionary + Embeddings",
                  "sentiment_emb" = "Dictionary + Sentiment + Embeddings",
                  "nostalgia_svm" = "SVM"
    )
}

# calculate the average nostalgia across the three policy groups.
# The join key is stated explicitly (as in the later merge of the same two
# data frames) instead of relying on all columns the inputs have in common.
dat_mean_nostalgia <- dat_classified_cmp_long |>
    left_join(dat_codes, by = "cmp_clean") |> # merge spreadsheet
    mutate(measure = recode_measure_label(measure)) |>
    group_by(
        measure,
        type_policy_area, group
    ) |>
    # share and number of nostalgic sentences per measure and policy group
    summarise(
        mean_nostalgia_policy = mean(nostalgic_dummy, na.rm = TRUE),
        n_nostalgic = sum(nostalgic_dummy, na.rm = TRUE),
        n_category = n()
    ) |>
    group_by(group, type_policy_area) |>
    # summary of the shares across measures
    summarise(
        mean_nostalgia = mean(mean_nostalgia_policy),
        min_nostalgia = min(mean_nostalgia_policy),
        max_nostalgia = max(mean_nostalgia_policy),
        sd_measure = sd(mean_nostalgia_policy)
    ) |>
    arrange(-mean_nostalgia)


# get sum and average of nostalgia across policy groups,
# separately for each measure
dat_mean_nostalgia_sep <- dat_classified_cmp_long |>
    left_join(dat_codes, by = "cmp_clean") |>
    mutate(measure = recode_measure_label(measure)) |>
    group_by(
        measure,
        # countryname, party, date,
        type_policy_area, group
    ) |>
    summarise(
        mean_nostalgia_policy = mean(nostalgic_dummy),
        n_nostalgic = sum(nostalgic_dummy),
        n_category = n()
    )


# flag whether a measure is based on machine learning or on a dictionary
measures_ml <- c("SVM", "DistilBERT")

dat_mean_nostalgia_sep <- dat_mean_nostalgia_sep |>
    mutate(type = if_else(measure %in% measures_ml,
                          "Machine Learning", "Dictionary"
    ))

# attach the policy-area coding to the long CMP data
dat_classified_cmp_long_merged <- dat_classified_cmp_long |>
    left_join(dat_codes, by = "cmp_clean")


# nostalgic sentences only, without the combined SVM-or-BERT measure,
# and with labelled measures
dat_classified_cmp_nostalgic_long_merged <- dat_classified_cmp_long_merged |>
    filter(nostalgic_dummy == 1) |>
    mutate(measure = str_remove_all(measure, "nostalgia_sentence_")) |>
    filter(measure != "svm_or_bert") |>
    mutate(
        measure = dplyr::recode(
            measure,
            "bert" = "DistilBERT",
            "dummy" = "Dictionary",
            "dummy_sentiment" = "Dictionary + Sentiment",
            "dummy_sentiment_emb" = "Dictionary + Embeddings + Sentiment",
            "dummy_emb" = "Dictionary + Embeddings",
            "sentiment_emb" = "Dictionary + Sentiment + Embeddings",
            "nostalgia_svm" = "SVM"
        )
    )

# share of nostalgic sentences in each broad policy area, per measure
dat_nostalgia_count <- dat_classified_cmp_nostalgic_long_merged |>
    count(measure, type_policy_area_broad) |>
    group_by(measure) |>
    mutate(prop = n / sum(n))


# Percentage of each policy area among the nostalgic sentences of every
# classifier, reshaped to one row per policy area and one column per
# classifier. Classifier labels are abbreviated so the table fits the page.
dat_nostalgia_policy <- dat_classified_cmp_nostalgic_long_merged |>
    mutate(
        measure = str_replace_all(
            measure,
            c(
                "Dictionary" = "Dict.",
                "Embeddings" = "Emb.",
                "Sentiment" = "Sent."
            )
        )
    ) |>
    ungroup() |>
    count(measure, group, type_policy_area_broad) |>
    group_by(measure) |>
    mutate(perc = round(100 * n / sum(n), 1)) |>
    select(-n) |>
    mutate(perc = ifelse(is.na(perc), 0, perc)) |>
    spread(measure, perc)

# Policy areas that a classifier never flagged are NA after reshaping;
# treat them as a share of 0
dat_nostalgia_policy[is.na(dat_nostalgia_policy)] <- 0

names(dat_nostalgia_policy)

# Row-wise average over all classifier columns; the range `Dict.`:SVM relies
# on the column order produced by spread()
row_means <- dat_nostalgia_policy |>
    select(`Dict.`:SVM)

# append the rounded average as a new column
dat_nostalgia_policy$Mean <- round(rowMeans(row_means), 1)

# Sort policy areas by their average share (largest first) and give the
# columns publication-ready names
dat_nostalgia_policy_clean <- dat_nostalgia_policy |>
    arrange(desc(Mean)) |>
    rename(
        `Policy Area` = group,
        Type = type_policy_area_broad,
        BERT = DistilBERT
    ) |>
    select(`Policy Area`, Type, Mean, `Dict.`:SVM)


names(dat_nostalgia_policy_clean)

# Put the classifier columns into the order used in the table:
# dictionary variants first, then the two machine-learning classifiers
dat_nostalgia_policy_clean <- select(
    dat_nostalgia_policy_clean,
    `Policy Area`,
    Type,
    Mean,
    `Dict.`,
    `Dict. + Emb.`,
    `Dict. + Sent.`,
    `Dict. + Emb. + Sent.`,
    SVM,
    BERT
)

names(dat_nostalgia_policy_clean)


# Table A15 ----
# Write the policy-area shares to a LaTeX table. `align` holds ten entries:
# one for the (suppressed) row-name column plus one for each of the nine
# data columns. The number of digits is set in xtable(); print() has no
# `digits` argument of its own, so it is not repeated there.
print(
    xtable(dat_nostalgia_policy_clean,
           digits = 1,
           caption = "Prevalence (\\%) of policy areas in sentences classified as nostalgic",
           label = "tab:nostalgia_policyareas",
           align = c(
               "p{0.03\\textwidth}",
               "p{0.4\\textwidth}",
               "p{0.1\\textwidth}",
               "p{0.05\\textwidth}",
               "p{0.05\\textwidth}",
               "p{0.05\\textwidth}",
               "p{0.05\\textwidth}",
               "p{0.05\\textwidth}",
               "p{0.05\\textwidth}",
               "p{0.05\\textwidth}"
           )
    ),
    type = "latex",
    size = "footnotesize",
    file = "tab_a15.tex",
    include.rownames = FALSE,
    caption.placement = "top"
)


# number of policy-type rows available for each classifier
table(dat_nostalgia_count$measure)

# sanity check: the shares have to sum to 1 within every classifier
dat_nostalgia_count |>
    group_by(measure) |>
    summarise(sum = sum(prop))

# bar labels of the form "12% (3,456)": rounded share followed by the raw
# count with a thousands separator
dat_nostalgia_count <- dat_nostalgia_count |>
    mutate(
        label = paste0(
            round(prop * 100, 0),
            "% (",
            str_squish(format(n, big.mark = ",")),
            ")"
        )
    )


# Figure A12 ----
# Horizontal bars showing the share of nostalgic sentences per broad policy
# type, one panel per classifier, each bar annotated with share and count
ggplot(
    data = dat_nostalgia_count,
    mapping = aes(
        x = forcats::fct_rev(type_policy_area_broad),
        y = prop,
        fill = type_policy_area_broad
    )
) +
    geom_col(colour = "black") +
    geom_text(
        aes(label = label),
        size = 4.5, colour = "black",
        nudge_y = 0.02, hjust = 0
    ) +
    facet_wrap(~measure, nrow = 5) +
    coord_flip() +
    scale_y_continuous(
        limits = c(0, 1),
        breaks = seq(0, 1, 0.2),
        labels = scales::percent_format(accuracy = 1)
    ) +
    scale_fill_manual(values = c("black", "grey60", "white")) +
    labs(
        x = "Policy Area of Nostalgic Sentence",
        y = "Percentage of Nostalgic Sentences"
    ) +
    theme(legend.position = "none")
# ggsave() without a `plot` argument stores the most recent plot
ggsave("fig_a12.pdf", width = 9, height = 5)



# Count nostalgic sentences per classifier, party family and broad policy
# type; shares are computed within each party family and classifier
dat_nostalgia_parfam_count <- dat_classified_cmp_nostalgic_long_merged |>
    ungroup() |>
    count(measure, party_family_recoded, type_policy_area_broad) |>
    group_by(party_family_recoded, measure) |>
    mutate(prop = n / sum(n))


# The figure only contrasts the dictionary with DistilBERT
dat_nostalgia_parfam_count_subset <- filter(
    dat_nostalgia_parfam_count,
    measure %in% c("DistilBERT", "Dictionary")
)


# Figure A13 ----
# Horizontal bars per party family: classifiers in rows, broad policy types
# in columns, every bar annotated with its rounded percentage
ggplot(
    data = dat_nostalgia_parfam_count_subset,
    mapping = aes(
        x = forcats::fct_rev(party_family_recoded),
        y = prop
    )
) +
    geom_col() +
    geom_text(
        aes(label = paste0(round(prop, digits = 2) * 100, "%")),
        colour = "black", nudge_y = 0.02, hjust = 0
    ) +
    facet_grid(measure ~ type_policy_area_broad) +
    coord_flip() +
    scale_y_continuous(
        limits = c(0, 0.85),
        breaks = seq(0, 0.75, 0.25),
        labels = scales::percent_format(accuracy = 1)
    ) +
    labs(
        x = "Party Family",
        y = "Percentage of Nostalgic Sentences"
    )
# ggsave() without a `plot` argument stores the most recent plot
ggsave("fig_a13.pdf", width = 9, height = 6)



# Build the manifesto-level dataset: keep the variables needed for the
# manifesto-level analyses and drop duplicated rows so that each manifesto
# is represented once
dat_manifestolevel <- dat_classified_with_bert |>
    dplyr::select(
        party, edate, date, parfam, partyname,
        countryname,
        nostalgia_sum_manifesto,
        nostalgia_sum_manifesto_emb,
        type,
        cabinet_status_lag,
        cabinet_status,
        starts_with("vote_share_cmp"),
        starts_with("nostalgia_sentences_per_1000"),
        starts_with("nost_sents_sum"),
        n_sentences_manifesto,
        pervote, presvote,
        rile, logrile,
        stateconomy,
        starts_with("loglibcons")
    ) |>
    unique()

nrow(dat_manifestolevel)

# Derive time variables and identifiers from the election date:
#   year         - calendar year of the election
#   election_id  - country + election date
#   manifesto_id - party + election date
#   decade       - first three digits of the date followed by "0"
dat_manifestolevel <- dat_manifestolevel |>
    mutate(
        year = lubridate::year(edate),
        election_id = paste(countryname, edate, sep = "_"),
        manifesto_id = paste(party, edate, sep = "_"),
        decade = factor(paste0(substr(edate, 1, 3), "0"))
    )

# the number of rows must be unchanged by the added variables
nrow(dat_manifestolevel)

# Translate the numeric CMP party-family codes into readable labels
dat_manifestolevel <- dat_manifestolevel |>
    mutate(
        party_family = dplyr::recode(
            parfam,
            "10" = "Ecological",
            "20" = "Socialist",
            "30" = "Social democratic",
            "40" = "Liberal",
            "50" = "Christian democratic",
            "60" = "Conservative",
            "70" = "Nationalist",
            "80" = "Agrarian",
            "90" = "Ethnic and regional",
            "95" = "Special issue",
            "98" = "Electoral alliances"
        )
    )


# Load the Comparative Political Data Set to obtain economic indicators for
# the years before each election
dat_cpds <- haven::read_dta("CPDS_1960-2018_Update_2020.dta")

# Keep and rename the relevant variables, then compute one- and two-year lags
# within each country. The lags are positional, so this assumes one row per
# country and year without gaps -- TODO confirm against the CPDS file.
dat_cpds_subset <- dat_cpds |>
    select(
        year,
        unemp,
        countryname = country,
        real_gdp_growth = realgdpgr,
        nominal_gdp_growth = nomgdpgr
    ) |>
    arrange(countryname, year) |>
    group_by(countryname) |>
    mutate(
        unemp_lag1 = dplyr::lag(unemp, 1),
        unemp_lag2 = dplyr::lag(unemp, 2),
        # change in unemployment between the two years before the election
        unemp_change = unemp_lag1 - unemp_lag2,
        real_gdp_growth_lag1 = dplyr::lag(real_gdp_growth, 1),
        real_gdp_growth_lag2 = dplyr::lag(real_gdp_growth, 2),
        nominal_gdp_growth_lag1 = dplyr::lag(nominal_gdp_growth, 1),
        nominal_gdp_growth_lag2 = dplyr::lag(nominal_gdp_growth, 2)
    )


# Add the CPDS indicators to every manifesto, matched on country and
# election year
dat_save <- dat_manifestolevel |>
    left_join(dat_cpds_subset, by = c("countryname", "year"))

# both counts must be identical, otherwise the merge duplicated manifestos
nrow(dat_manifestolevel)
nrow(dat_save)

# merge populism dataset
dat_populist <- rio::import("populist-version-2-20200626.xlsx")

nrow(dat_populist)

# cross-tabulate the far-right and populist classifications
table(
    dat_populist$farright,
    dat_populist$populist
)

# select and rename relevant variables; the dataset's `manifesto_id` column
# is renamed to `party` so that it can serve as the merge key below
dat_populist_select <- dat_populist |>
    dplyr::select(
        populism_popu_list = populist,
        populist_start:eurosceptic_bl,
        party = manifesto_id
    ) |>
    unique()



# merge with manifesto-level dataset; the key is converted to character in
# both data frames so that the join does not fail on mismatched types
dat_save$party <- as.character(dat_save$party)
dat_populist_select$party <- as.character(dat_populist_select$party)

dat_save <- left_join(dat_save,
                      dat_populist_select,
                      by = "party"
)


# recode NA in populist to 0 (e.g. parties without a match in the populism
# dataset) and derive two dummies:
#   populist_dummy         - party is classified as populist
#   populist_dummy_precise - party is classified as populist AND the election
#                            took place in or after the first populist year
dat_save <- dat_save |>
    mutate(populism_popu_list = ifelse(is.na(populism_popu_list), 0, populism_popu_list)) |>
    mutate(populist_dummy = ifelse(populism_popu_list == 1,
                                   "Populist Party", "Other Party"
    )) |>
    # Compare the election YEAR with populist_start rather than the full
    # election date: a date compared with a year-like number is evaluated on
    # its internal day count and would be TRUE for almost every election.
    # NOTE(review): assumes populist_start holds a calendar year -- confirm
    # against the populism dataset.
    mutate(populist_dummy_precise = ifelse(populism_popu_list == 1 & year >= populist_start,
                                           "Populist Party", "Other Party"
    ))


table(dat_save$populist_start)
table(dat_save$populist_end)

# compare the simple and the time-sensitive populism dummy
table(
    all = dat_save$populist_dummy,
    precise = dat_save$populist_dummy_precise
)

# Three-way classification: far-right, far-left, or "Other" for every party
# that is flagged as neither (including parties with missing flags)
dat_save <- dat_save |>
    mutate(
        extremist_popu_list_categories = case_when(
            farright == 1 ~ "Far-right",
            farleft == 1 ~ "Far-left",
            TRUE ~ "Other"
        )
    )


table(dat_save$extremist_popu_list_categories)

# Turn into a factor with "Other" as the first (reference) level
dat_save$extremist_popu_list_categories <- factor(
    dat_save$extremist_popu_list_categories,
    levels = c("Other", "Far-right", "Far-left")
)


# no NA should remain after the recoding
table(
    dat_save$extremist_popu_list_categories,
    useNA = "always"
)


# recode populist parties: populist far-right, populist far-left, or "Other"
# for all remaining parties
dat_save <- dat_save |>
    mutate(populism_popu_list_categories = case_when(
        farright == 1 & populism_popu_list == 1 ~ "Populist Far-right",
        farleft == 1 & populism_popu_list == 1 ~ "Populist Far-left",
        TRUE ~ "Other"
    ))


# factor with "Other" as the first (reference) level
dat_save$populism_popu_list_categories <- factor(dat_save$populism_popu_list_categories,
                                                 levels = c("Other", "Populist Far-right", "Populist Far-left")
)

# check the newly created variable (no NA should remain)
table(dat_save$populism_popu_list_categories,
      useNA = "always"
)


table(dat_save$populism_popu_list)

# store the populism indicator as a factor
dat_save$populism_popu_list <- factor(dat_save$populism_popu_list)

# Sample description: number of distinct countries and parties
length(unique(dat_save[["countryname"]]))
length(unique(dat_save[["party"]]))

# earliest and latest election date in the sample
min(dat_save[["edate"]])
max(dat_save[["edate"]])

# Metadata on each manifesto, used here only for the party abbreviation
dat_meta <- readRDS("data_cmp_main.rds")
dat_meta <- select(dat_meta, party, edate, partyabbrev)

# Both merge keys are converted to character in both data frames so that the
# join compares like with like
dat_save[["edate"]] <- as.character(dat_save[["edate"]])
dat_save[["party"]] <- as.character(dat_save[["party"]])
dat_meta[["edate"]] <- as.character(dat_meta[["edate"]])
dat_meta[["party"]] <- as.character(dat_meta[["party"]])

# add the party abbreviation to every manifesto
dat_save_meta <- dat_save |>
    left_join(dat_meta, by = c("party", "edate"))


# Collapse the agrarian, ethnic/regional and special-issue families into
# "Other" and convert all family labels to title case
dat_save_meta <- dat_save_meta |>
    mutate(
        party_family_recoded = str_to_title(
            dplyr::recode(
                party_family,
                "Agrarian" = "Other",
                "Ethnic and regional" = "Other",
                "Special issue" = "Other"
            )
        )
    )


# Country lists used to assign each manifesto to a European region

# Central and Eastern Europe
countries_eastern <- c(
    "Bulgaria",
    "Czech Republic",
    "Estonia",
    "Hungary",
    "Latvia",
    "Lithuania",
    "Poland",
    "Slovenia"
)

# Southern Europe
countries_southern <- c("Greece", "Italy", "Spain", "Portugal")

# Northern Europe
countries_nordic <- c("Finland", "Iceland", "Norway", "Sweden", "Denmark")

# Western Europe
countries_western <- c(
    "Austria",
    "Belgium",
    "France",
    "Germany",
    "Ireland",
    "Netherlands",
    "United Kingdom"
)





# Assign every manifesto to a region via its country. The four country lists
# do not overlap, so the order of the conditions is irrelevant; a country
# that appears in none of the lists receives NA.
dat_save_meta <- dat_save_meta |>
    mutate(
        region = case_when(
            countryname %in% countries_western ~ "Western Europe",
            countryname %in% countries_nordic ~ "Northern Europe",
            countryname %in% countries_southern ~ "Southern Europe",
            countryname %in% countries_eastern ~ "Central and Eastern Europe"
        )
    )


# Inspect the available columns before removing those that are not used in
# the analysis
names(dat_save_meta)

# Step 1: drop the unused variables.
# Step 2: move identifiers, economic indicators, cabinet status and the
#         populism variables to the front; all other columns follow in their
#         existing order.
dat_save_meta_clean <- dat_save_meta |>
    select(
        !c(
            starts_with("change"), party_family, presvote, vote_share_cmp_lag2,
            starts_with("farright"),
            starts_with("farleft"), starts_with("euroscept"),
            populist_dummy_precise, extremist_popu_list_categories,
            populist_bl, populist_start, populist_end, pervote,
            loglibcons.imp, loglibcons_lag,
            nostalgia_sum_manifesto, nostalgia_sum_manifesto_emb,
            populist_startnobl, populist_endnobl
        )
    ) |>
    select(
        election_id, manifesto_id, year, edate, date,
        party, parfam, partyname, partyabbrev, party_family_recoded,
        countryname, region,
        starts_with("unemp"), starts_with("real_gdp"),
        type, cabinet_status, cabinet_status_lag,
        starts_with("popu"),
        everything()
    )

names(dat_save_meta_clean)


# List manifestos with a missing cabinet status; these are recoded by hand
dat_save_meta_clean |>
    filter(is.na(cabinet_status)) |>
    select(party, partyabbrev, edate, cabinet_status)

# PiS (92436), election of 2007-10-21: according to ParlGov data the party
# was not in government after the election
dat_save_meta_clean <- mutate(
    dat_save_meta_clean,
    cabinet_status = ifelse(
        party == "92436" & edate == "2007-10-21",
        "Opposition",
        cabinet_status
    )
)

table(dat_save_meta_clean$cabinet_status)


# List manifestos with a missing lagged cabinet status
dat_save_meta_clean |>
    select(party, partyabbrev, edate, cabinet_status_lag) |>
    filter(is.na(cabinet_status_lag))

# Manual recodes of cabinet_status_lag, applied one after another within a
# single mutate() call. Each step touches exactly one party-election pair.
dat_save_meta_clean <- dat_save_meta_clean |>
    mutate(
        # 53321: Social Democrats in Ireland;
        # did not exist before the 2016-02-26 election
        cabinet_status_lag = ifelse(
            party == "53321" & edate == "2016-02-26",
            "Did not exist", cabinet_status_lag
        ),
        # 80330: ABV; did not exist before the 2014-10-05 election
        cabinet_status_lag = ifelse(
            party == "80330" & edate == "2014-10-05",
            "Did not exist", cabinet_status_lag
        ),
        # 34020: SYRIZA, election of 2004-03-07;
        # founded in 2004: https://en.wikipedia.org/wiki/Syriza
        cabinet_status_lag = ifelse(
            party == "34020" & edate == "2004-03-07",
            "Did not exist", cabinet_status_lag
        ),
        # 86110: LMP, election of 2010-04-11; founded in 2009:
        # https://en.wikipedia.org/wiki/LMP_–_Hungary%27s_Green_Party
        cabinet_status_lag = ifelse(
            party == "86110" & edate == "2010-04-11",
            "Did not exist", cabinet_status_lag
        ),
        # 86221: DK, election of 2014-04-06; split from MSzP in 2014
        cabinet_status_lag = ifelse(
            party == "86221" & edate == "2014-04-06",
            "Did not exist", cabinet_status_lag
        ),
        # 92436: PiS, election of 2007-10-21; the party held the office of
        # prime minister after the 2005 election, hence "Government (PM)":
        # https://en.wikipedia.org/wiki/2005_Polish_parliamentary_election
        cabinet_status_lag = ifelse(
            party == "92436" & edate == "2007-10-21",
            "Government (PM)", cabinet_status_lag
        )
    )

# Should return 0: all missing values have been recoded manually
dat_save_meta_clean |>
    select(party, partyabbrev, edate, cabinet_status_lag) |>
    filter(is.na(cabinet_status_lag)) |>
    nrow()

# Distribution of the lagged cabinet status before harmonising its categories
table(dat_save_meta_clean$cabinet_status_lag)

# 1. Merge "Did not exist" and "Not represented in Parliament" into a single
#    category.
# 2. Create the binary variable cabinet_status_lag2: "Government" when the
#    lagged status contains "Government" (i.e. "Government" or
#    "Government (PM)"), "Opposition" for all other categories.
dat_save_meta_clean <- dat_save_meta_clean |>
    mutate(
        cabinet_status_lag = dplyr::recode(
            cabinet_status_lag,
            "Not represented in Parliament" = "Not represented in Parliament/Did not exist",
            "Did not exist" = "Not represented in Parliament/Did not exist"
        ),
        cabinet_status_lag2 = ifelse(
            str_detect(cabinet_status_lag, "Government"),
            "Government",
            "Opposition"
        )
    )


# Final dataset for the subsequent analyses: the DistilBERT-based rate serves
# as the main nostalgia measure and is placed in the first column; measures
# that are not analysed further are removed and any grouping is dropped
dat_save_meta_clean_smaller <- dat_save_meta_clean |>
    mutate(nostalgia_main = nostalgia_sentences_per_1000_bert) |>
    select(nostalgia_main, everything()) |>
    select(
        !c(
            nostalgia_sentences_per_1000_svm_or_bert,
            nostalgia_sentences_per_1000_ml_and_emb,
            nost_sents_sum_ml_and_emb,
            nost_sents_sum_svm_or_bert,
            starts_with("nostalgia_sentences_per_1000_en")
        )
    ) |>
    ungroup()

# store manifesto-level dataset for analysis and PolNos V1
saveRDS(
    dat_save_meta_clean_smaller,
    file = "data_nostalgia_manifestolevel.rds"
)
# saveRDS(dat_save_meta_clean_smaller, "data_polnos_manifestolevel.rds")
